title: “Visualization using various R packages” knit: (function(input_file, encoding) { out_dir<- ‘docs’; rmarkdown::render(input_file, encoding=encoding, output_file=file.path(dirname(input_file), out_dir, ‘index.html’))}) output: html_document — —

suppressWarnings(library("readxl"))
suppressWarnings(library(curatedOvarianData))
## Loading required package: affy
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
 df1 <- metadata <- read_excel("/Users/ana/Documents/independent study/new.xls",col_names=T)
## New names:
## • `` -> `...27`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
 data(TCGA_eset)
suppressWarnings(library(curatedOvarianData))
suppressWarnings(library(pheatmap))

# Extract gene expression data
exprs_matrix <- exprs(TCGA_eset)
rownames(exprs_matrix) <- featureNames(TCGA_eset)


subtypes <- df1 $SUBTYPE

# Subtypes to analyze
target_subtypes <- c("Immunoreactive", "Proliferative", "Differentiated", "Mesenchymal")

# Loop over each subtype
for (subtype in target_subtypes) {
    # Subset the data for each subtype
    subtype_indices <- which(subtypes == subtype)
    exprs_matrix_subtype <- exprs_matrix[, subtype_indices]
exprs_matrix_subtype <- exprs_matrix_subtype[!is.na(rowSums(exprs_matrix_subtype)), ]

# Using complete.cases
    # Subset to top 20 genes that have a relatively high variance
    exprs_matrix_subtype <- exprs_matrix_subtype[apply(exprs_matrix_subtype, 1, function(x) var(x) > 0.5), ][20:40, ]
    

    # Create heatmap
    pheatmap(exprs_matrix_subtype, scale = "row", clustering_distance_rows = "euclidean", 
             show_rownames = T, show_colnames=F,
             main = paste("Heatmap for", subtype)) 
}

suppressWarnings(library(igraph))
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:BiocGenerics':
## 
##     normalize, path, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
data(TCGA_eset)
exprs_matrix <- exprs(TCGA_eset)
subtypes <- df1 $SUBTYPE

target_subtypes <- c("Immunoreactive", "Proliferative", "Differentiated", "Mesenchymal")



# Loop over each subtype
for (subtype in target_subtypes) {
    # Subset the data for each subtype
    subtype_indices <- which(subtypes == subtype)
    tcga.exprs_subtype <- exprs_matrix[, subtype_indices]

    # Log2 transformation
    log2.exprs <- log2(tcga.exprs_subtype + 1)

    # Calculate the variance for each gene
    gene.variance <- apply(log2.exprs, 1, var)

    # Order genes by variance and select the top 100
    top.genes <- head(order(gene.variance, decreasing = TRUE), 5)

    # Subset the gene expression data with the top 50 genes
    subset.exprs <- log2.exprs[top.genes, ]

    # Generate a correlation matrix of the gene expression data
    cor.matrix <- cor(subset.exprs)
threshold <- 0.7  # Adjust this value according to your data and needs

    # Filter the correlation matrix
    cor.matrix[abs(cor.matrix) < threshold] <- 0

    # Create an igraph graph object
    graph <- graph.adjacency(cor.matrix, mode = "undirected", weighted = TRUE)
    
    # Remove self-loops
    graph <- simplify(graph, remove.multiple = FALSE, remove.loops = TRUE)

    # Set node color based on gene expression level
    node.color <- ifelse(rowMeans(subset.exprs) > log2(6), "red", "blue")

    # Plot the graph with node color based on gene expression level
    plot(graph, vertex.size =8, vertex.color = node.color,vertex.label.cex = 0.7, edge.width = E(graph)$weight*0.5,
         main = paste("Graph for", subtype)) # title for each graph
}
## Warning in v(graph): Non-positive edge weight found, ignoring all weights
## during graph layout.

## Warning in v(graph): Non-positive edge weight found, ignoring all weights
## during graph layout.

## Warning in v(graph): Non-positive edge weight found, ignoring all weights
## during graph layout.

## Warning in v(graph): Non-positive edge weight found, ignoring all weights
## during graph layout.

suppressWarnings(library(circlize))
## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================
## 
## Attaching package: 'circlize'
## The following object is masked from 'package:igraph':
## 
##     degree
# Extract the gene expression data
tcga.exprs <- exprs(TCGA_eset)

# Assume the subtype information is stored in a variable named "subtype" in pData
subtypes <- df1 $SUBTYPE

# Subtypes to analyze
target_subtypes <- c("Immunoreactive", "Proliferative", "Differentiated", "Mesenchymal")

# Loop over each subtype
for (subtype in target_subtypes) {
    # Subset the data for each subtype
    subtype_indices <- which(subtypes == subtype)
    tcga.exprs_subtype <- tcga.exprs[, subtype_indices]

    # Calculate the variance for each gene
    gene.variance <- apply(tcga.exprs_subtype, 1, var)

    # Order genes by variance and select the top 20
    top_genes <- head(order(gene.variance, decreasing = TRUE), 20)

    # Subset the gene expression data with the top 20 genes
    subset_exprs <- tcga.exprs_subtype[top_genes, ]

    # Calculate the correlation matrix of the gene expression data
    correlation_matrix <- cor(t(subset_exprs))

    grid.col <- setNames(rainbow(length(top_genes)), top_genes)

    # Plot
    par(mfrow = c(1, 1))
    chordDiagram(correlation_matrix, annotationTrack = "grid", 
                 preAllocateTracks = 1, 
                 grid.col = grid.col,
                 directional = 1, 
                 direction.type = c("diffHeight", "arrows"), 
                 link.arr.type = "big.arrow")

    circos.trackPlotRegion(track.index = 1, panel.fun = function(x, y) {
      xlim = get.cell.meta.data("xlim")
      ylim = get.cell.meta.data("ylim")
      sector.name = get.cell.meta.data("sector.index")
      circos.text(CELL_META$xcenter, 
                  ylim[1] + cm_h(2), 
                  sector.name, 
                  facing = "clockwise",
                  niceFacing = TRUE, 
                  adj = c(0, 0.5),
                  cex = .5,
                  col=grid.col[sector.name],
                  font = 1)
      circos.axis(h = "bottom",
                  labels.cex = .1,
                  sector.index = sector.name
                  )
    }, bg.border = NA)
    title(main = paste("Chord Diagram for", subtype), line = 2)
}

suppressWarnings(library(networkD3))
# Extract the gene expression data
tcga.exprs <- exprs(TCGA_eset)

# Assume the subtype information is stored in a variable named "subtype" in pData
subtypes <- df1 $SUBTYPE

# Subtypes to analyze
target_subtypes <- c("Immunoreactive", "Proliferative", "Differentiated", "Mesenchymal")

# Loop over each subtype
for (subtype in target_subtypes) {
  # Subset the data for each subtype
  subtype_indices <- which(subtypes == subtype)
  tcga.exprs_subtype <- tcga.exprs[, subtype_indices]

  # Calculate the variance for each gene
  gene.variance <- apply(tcga.exprs_subtype, 1, var)

  # Get the indices of the top 20 genes with the highest variance
  top_genes <- head(order(gene.variance, decreasing = TRUE), 20)

  # Subset the gene expression data with the top 20 genes
  subset_exprs <- tcga.exprs_subtype[top_genes, ]

  # Calculate the correlation matrix of the gene expression data
  cor_mat <- cor(subset_exprs)

  # Convert correlation matrix to edge list
  edge_list <- as.data.frame(as.table(cor_mat))
  colnames(edge_list) <- c("source", "target", "weight")

  # Create the simple network
  print(paste("Network for", subtype))
  simpleNetwork(edge_list,linkColour = "blue")
}
## [1] "Network for Immunoreactive"
## [1] "Network for Proliferative"
## [1] "Network for Differentiated"
## [1] "Network for Mesenchymal"